from NER.config.config import *

# Module-level configuration instance. build_data() reads `train_path` and
# `vocab_path` from it. `Config` is presumably provided by the star import
# above (NER.config.config) — TODO confirm.
config = Config()


def build_data():
    """Read the training file and build sentence samples plus a vocabulary.

    Each line of ``config.train_path`` is stripped of trailing whitespace and
    split on tabs; the first field is taken as the character and the last
    field as its label. Lines whose first field is empty are skipped.
    Characters and labels are accumulated until a sentence-ending punctuation
    mark is seen, at which point the accumulated pair is stored as one sample.

    The vocabulary starts with ``"PAD"`` and ``"UNK"`` followed by every
    distinct character in order of first appearance. It is passed to
    ``write_file`` together with ``config.vocab_path``.

    Returns:
        tuple: ``(datas, word2id)`` where ``datas`` is a list of
        ``[chars, labels]`` pairs (one per sentence) and ``word2id`` maps
        each vocabulary entry to its index in the vocabulary list.

    Note:
        Characters that follow the last sentence terminator in the file are
        not included in ``datas`` (they still enter the vocabulary).
    """
    # Punctuation marks that close a sentence; built once, not per line.
    sentence_end = frozenset(['。', '?', '!', '！', '？'])

    datas = []
    sample_x = []
    sample_y = []
    vocab_list = ["PAD", 'UNK']
    # Mirror of vocab_list for O(1) membership tests; the list keeps order.
    seen = set(vocab_list)

    # Context manager guarantees the training file is closed.
    with open(config.train_path, 'r', encoding='utf-8') as train_file:
        for line in train_file:
            # split() always yields at least one field, so only the first
            # field needs to be checked for emptiness.
            fields = line.rstrip().split('\t')
            char = fields[0]
            if not char:
                continue
            cate = fields[-1]
            sample_x.append(char)
            sample_y.append(cate)
            if char not in seen:
                seen.add(char)
                vocab_list.append(char)
            if char in sentence_end:
                datas.append([sample_x, sample_y])
                sample_x = []
                sample_y = []

    word2id = {wd: index for index, wd in enumerate(vocab_list)}
    write_file(vocab_list, config.vocab_path)
    return datas, word2id


def write_file(vocab_list, filepath):
    """Write the vocabulary to ``filepath``, one entry per line.

    The file is opened in write mode so that each call replaces any previous
    contents. Appending would duplicate the vocabulary on every run and,
    because no trailing newline is written, fuse the last entry of one run
    with the first entry of the next.

    Args:
        vocab_list: Iterable of strings to write, in order.
        filepath: Destination path; created if missing, truncated otherwise.
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write('\n'.join(vocab_list))


if __name__ == "__main__":
    # Script entry point: build the dataset (build_data also hands the
    # vocabulary to write_file) and dump the sentence samples to stdout.
    built = build_data()
    datas, word2id = built
    # Only the samples are printed; word2id is unpacked but not shown.
    print(datas)
    # print(word2id)
